[tidymodels] Modeling hotel bookings in R using tidymodels and recipes

R
Author

Tony Duan

Published

January 29, 2023

Code
library(tidymodels)
library(themis)
library(tidyverse)
library(skimr)
library(GGally)
library(readr)      
library(vip)   

1. download data

Code
hotels <- 
  read_csv('https://tidymodels.org/start/case-study/hotels.csv') %>%
  mutate(across(where(is.character), as.factor))

dim(hotels)
[1] 50000    23
Code
glimpse(hotels)
Rows: 50,000
Columns: 23
$ hotel                          <fct> City_Hotel, City_Hotel, Resort_Hotel, R…
$ lead_time                      <dbl> 217, 2, 95, 143, 136, 67, 47, 56, 80, 6…
$ stays_in_weekend_nights        <dbl> 1, 0, 2, 2, 1, 2, 0, 0, 0, 2, 1, 0, 1, …
$ stays_in_week_nights           <dbl> 3, 1, 5, 6, 4, 2, 2, 3, 4, 2, 2, 1, 2, …
$ adults                         <dbl> 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 1, 2, …
$ children                       <fct> none, none, none, none, none, none, chi…
$ meal                           <fct> BB, BB, BB, HB, HB, SC, BB, BB, BB, BB,…
$ country                        <fct> DEU, PRT, GBR, ROU, PRT, GBR, ESP, ESP,…
$ market_segment                 <fct> Offline_TA/TO, Direct, Online_TA, Onlin…
$ distribution_channel           <fct> TA/TO, Direct, TA/TO, TA/TO, Direct, TA…
$ is_repeated_guest              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ previous_cancellations         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ previous_bookings_not_canceled <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ reserved_room_type             <fct> A, D, A, A, F, A, C, B, D, A, A, D, A, …
$ assigned_room_type             <fct> A, K, A, A, F, A, C, A, D, A, D, D, A, …
$ booking_changes                <dbl> 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ deposit_type                   <fct> No_Deposit, No_Deposit, No_Deposit, No_…
$ days_in_waiting_list           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ customer_type                  <fct> Transient-Party, Transient, Transient, …
$ average_daily_rate             <dbl> 80.75, 170.00, 8.00, 81.00, 157.60, 49.…
$ required_car_parking_spaces    <fct> none, none, none, none, none, none, non…
$ total_of_special_requests      <dbl> 1, 3, 2, 1, 4, 1, 1, 1, 1, 1, 0, 1, 0, …
$ arrival_date                   <date> 2016-09-01, 2017-08-25, 2016-11-19, 20…

2. plot

imbalanced outcome

Code
hotels %>% 
  count(children) %>% 
  mutate(prop = n/sum(n))
# A tibble: 2 × 3
  children     n   prop
  <fct>    <int>  <dbl>
1 children  4038 0.0808
2 none     45962 0.919 
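
Only about 8% of bookings include children, so the classes are imbalanced. themis is loaded above but is not used in the recipes below; a minimal sketch of handling the imbalance inside a recipe with themis::step_downsample() could look like this (illustrative only, not part of the models that follow):

Code
# Illustrative only: downsample the majority "none" class inside a recipe (not used below)
balance_recipe <- 
  recipe(children ~ ., data = hotels) %>% 
  step_downsample(children, under_ratio = 1)
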
Code
skim(hotels)
Data summary
Name hotels
Number of rows 50000
Number of columns 23
_______________________
Column type frequency:
Date 1
factor 11
numeric 11
________________________
Group variables None

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
arrival_date 0 1 2015-07-01 2017-08-31 2016-08-29 793

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
hotel 0 1 FALSE 2 Cit: 30752, Res: 19248
children 0 1 FALSE 2 non: 45962, chi: 4038
meal 0 1 FALSE 5 BB: 38316, HB: 6399, SC: 4494, Und: 580
country 0 1 FALSE 155 PRT: 14046, GBR: 6405, FRA: 5627, ESP: 4298
market_segment 0 1 FALSE 7 Onl: 23760, Off: 10604, Dir: 7131, Gro: 5124
distribution_channel 0 1 FALSE 5 TA/: 38349, Dir: 8083, Cor: 3459, GDS: 108
reserved_room_type 0 1 FALSE 9 A: 34889, D: 8675, E: 3096, F: 1299
assigned_room_type 0 1 FALSE 10 A: 27357, D: 12577, E: 3924, F: 1839
deposit_type 0 1 FALSE 3 No_: 49839, Ref: 92, Non: 69
customer_type 0 1 FALSE 4 Tra: 35343, Tra: 12430, Con: 1864, Gro: 363
required_car_parking_spaces 0 1 FALSE 2 non: 45019, par: 4981

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
lead_time 0 1 80.09 91.20 0.00 8.0 45.0 125 709 ▇▂▁▁▁
stays_in_weekend_nights 0 1 0.93 1.00 0.00 0.0 1.0 2 19 ▇▁▁▁▁
stays_in_week_nights 0 1 2.46 1.94 0.00 1.0 2.0 3 50 ▇▁▁▁▁
adults 0 1 1.83 0.51 0.00 2.0 2.0 2 4 ▁▂▇▁▁
is_repeated_guest 0 1 0.04 0.20 0.00 0.0 0.0 0 1 ▇▁▁▁▁
previous_cancellations 0 1 0.02 0.29 0.00 0.0 0.0 0 13 ▇▁▁▁▁
previous_bookings_not_canceled 0 1 0.20 1.80 0.00 0.0 0.0 0 72 ▇▁▁▁▁
booking_changes 0 1 0.29 0.74 0.00 0.0 0.0 0 21 ▇▁▁▁▁
days_in_waiting_list 0 1 1.57 14.79 0.00 0.0 0.0 0 379 ▇▁▁▁▁
average_daily_rate 0 1 99.94 49.04 -6.38 67.5 92.5 125 510 ▇▆▁▁▁
total_of_special_requests 0 1 0.71 0.83 0.00 0.0 1.0 1 5 ▇▁▁▁▁
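
GGally is loaded above; a quick pairwise view of a few predictors against the outcome could look like this (a sketch; the column choice and the 5,000-row sample are only to keep the plot light):

Code
# Sketch: pairwise plot of a few predictors, coloured by the outcome
hotels %>% 
  select(children, lead_time, average_daily_rate, total_of_special_requests) %>% 
  sample_n(5000) %>% 
  ggpairs(mapping = aes(color = children))
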

3. data preparation

Code
set.seed(123)
splits      <- initial_split(hotels, strata = children)

hotel_other <- training(splits)
hotel_test  <- testing(splits)
Code
hotel_other %>% 
  count(children) %>% 
  mutate(prop = n/sum(n))
# A tibble: 2 × 3
  children     n   prop
  <fct>    <int>  <dbl>
1 children  3027 0.0807
2 none     34473 0.919 
Code
hotel_test  %>% 
  count(children) %>% 
  mutate(prop = n/sum(n))
# A tibble: 2 × 3
  children     n   prop
  <fct>    <int>  <dbl>
1 children  1011 0.0809
2 none     11489 0.919 
Code
set.seed(234)
val_set <- validation_split(hotel_other, 
                            strata = children, 
                            prop = 0.80)
val_set
# Validation Set Split (0.8/0.2)  using stratification 
# A tibble: 1 × 2
  splits               id        
  <list>               <chr>     
1 <split [30000/7500]> validation

4. build models

recipe

Code
holidays <- c("AllSouls", "AshWednesday", "ChristmasEve", "Easter", 
              "ChristmasDay", "GoodFriday", "NewYearsDay", "PalmSunday")

lr_recipe <- 
  recipe(children ~ ., data = hotel_other) %>% 
  step_date(arrival_date) %>%                          # year, month, day-of-week features
  step_holiday(arrival_date, holidays = holidays) %>%  # indicators for the holidays above
  step_rm(arrival_date) %>% 
  step_dummy(all_nominal_predictors()) %>%             # glmnet needs numeric predictors
  step_zv(all_predictors()) %>%                        # drop zero-variance columns
  step_normalize(all_predictors())                     # center and scale for the penalty
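
Before fitting, the recipe can be prepped and baked on the training data to check what the processed predictors look like (an optional sanity check; the workflow below handles this automatically):

Code
# Optional check: estimate the recipe and inspect the processed training set
lr_recipe %>% 
  prep() %>% 
  bake(new_data = NULL) %>% 
  glimpse()
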

model

Code
lr_mod <- 
  logistic_reg(penalty = tune(), mixture = 1) %>% 
  set_engine("glmnet")
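
penalty is left to be tuned, and mixture = 1 makes this a pure lasso penalty, so uninformative predictors can be dropped from the model entirely. translate() shows how the parsnip spec maps onto the underlying glmnet call (optional):

Code
# Optional: see the glmnet call template generated by parsnip
lr_mod %>% translate()
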

workflow

Code
lr_workflow <- 
  workflow() %>% 
  add_model(lr_mod) %>% 
  add_recipe(lr_recipe)

CREATE THE GRID FOR TUNING

Code
lr_reg_grid <- tibble(penalty = 10^seq(-4, -1, length.out = 30))

lr_reg_grid %>% top_n(-5) # lowest penalty values
# A tibble: 5 × 1
   penalty
     <dbl>
1 0.0001  
2 0.000127
3 0.000161
4 0.000204
5 0.000259
Code
lr_reg_grid %>% top_n(5)  # highest penalty values
# A tibble: 5 × 1
  penalty
    <dbl>
1  0.0386
2  0.0489
3  0.0621
4  0.0788
5  0.1   

TRAIN AND TUNE THE MODEL

Code
lr_res <- 
  lr_workflow %>% 
  tune_grid(val_set,
            grid = lr_reg_grid,
            control = control_grid(save_pred = TRUE),
            metrics = metric_set(roc_auc))
Code
lr_plot <- 
  lr_res %>% 
  collect_metrics() %>% 
  ggplot(aes(x = penalty, y = mean)) + 
  geom_point() + 
  geom_line() + 
  ylab("Area under the ROC Curve") +
  scale_x_log10(labels = scales::label_number())

lr_plot 

Code
top_models <-
  lr_res %>% 
  show_best("roc_auc", n = 15) %>% 
  arrange(penalty) 
top_models
# A tibble: 15 × 7
    penalty .metric .estimator  mean     n std_err .config              
      <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
 1 0.000127 roc_auc binary     0.872     1      NA Preprocessor1_Model02
 2 0.000161 roc_auc binary     0.872     1      NA Preprocessor1_Model03
 3 0.000204 roc_auc binary     0.873     1      NA Preprocessor1_Model04
 4 0.000259 roc_auc binary     0.873     1      NA Preprocessor1_Model05
 5 0.000329 roc_auc binary     0.874     1      NA Preprocessor1_Model06
 6 0.000418 roc_auc binary     0.874     1      NA Preprocessor1_Model07
 7 0.000530 roc_auc binary     0.875     1      NA Preprocessor1_Model08
 8 0.000672 roc_auc binary     0.875     1      NA Preprocessor1_Model09
 9 0.000853 roc_auc binary     0.876     1      NA Preprocessor1_Model10
10 0.00108  roc_auc binary     0.876     1      NA Preprocessor1_Model11
11 0.00137  roc_auc binary     0.876     1      NA Preprocessor1_Model12
12 0.00174  roc_auc binary     0.876     1      NA Preprocessor1_Model13
13 0.00221  roc_auc binary     0.876     1      NA Preprocessor1_Model14
14 0.00281  roc_auc binary     0.875     1      NA Preprocessor1_Model15
15 0.00356  roc_auc binary     0.873     1      NA Preprocessor1_Model16
Code
lr_best <- 
  lr_res %>% 
  collect_metrics() %>% 
  arrange(penalty) %>% 
  slice(12)
lr_best
# A tibble: 1 × 7
  penalty .metric .estimator  mean     n std_err .config              
    <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
1 0.00137 roc_auc binary     0.876     1      NA Preprocessor1_Model12
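
Because tuning used a single validation set, n is 1 and std_err is NA in these tables. The 12th-lowest penalty is picked by hand: its ROC AUC is essentially tied with the best candidates, and the larger penalty gives a sparser model. select_best() would instead return the numerically best row (an alternative, not what is used above):

Code
# Alternative: let tune pick the numerically best penalty
lr_res %>% 
  select_best(metric = "roc_auc")
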
Code
lr_auc <- 
  lr_res %>% 
  collect_predictions(parameters = lr_best) %>% 
  # "children" is the first factor level, so .pred_children is the event-class probability
  roc_curve(children, .pred_children) %>% 
  mutate(model = "Logistic Regression")

autoplot(lr_auc)

A SECOND MODEL: TREE-BASED ENSEMBLE

Code
cores <- parallel::detectCores()
cores
[1] 4
Code
rf_mod <- 
  rand_forest(mtry = tune(), min_n = tune(), trees = 1000) %>% 
  set_engine("ranger", num.threads = cores) %>% 
  set_mode("classification")
Code
# tree ensembles handle factor predictors and unscaled values directly,
# so no dummy or normalization steps are needed here
rf_recipe <- 
  recipe(children ~ ., data = hotel_other) %>% 
  step_date(arrival_date) %>% 
  step_holiday(arrival_date) %>% 
  step_rm(arrival_date) 
Code
rf_workflow <- 
  workflow() %>% 
  add_model(rf_mod) %>% 
  add_recipe(rf_recipe)
Code
rf_mod
Random Forest Model Specification (classification)

Main Arguments:
  mtry = tune()
  trees = 1000
  min_n = tune()

Engine-Specific Arguments:
  num.threads = cores

Computational engine: ranger 
Code
extract_parameter_set_dials(rf_mod)
Collection of 2 parameters for tuning

 identifier  type    object
       mtry  mtry nparam[?]
      min_n min_n nparam[+]

Model parameters needing finalization:
   # Randomly Selected Predictors ('mtry')

See `?dials::finalize` or `?dials::update.parameters` for more information.
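
mtry depends on how many predictors reach the model, so tune_grid() finalizes it once it sees the data (hence the message printed below). It could also be finalized by hand against the training predictors (a sketch; the recipe adds date-based columns, so the exact upper bound will differ slightly):

Code
# Sketch: finalize the mtry range by hand from the raw predictor columns
extract_parameter_set_dials(rf_mod) %>% 
  finalize(hotel_other %>% select(-children))
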
Code
set.seed(345)
rf_res <- 
  rf_workflow %>% 
  tune_grid(val_set,
            grid = 3,
            control = control_grid(save_pred = TRUE),
            metrics = metric_set(roc_auc))
#> i Creating pre-processing data to finalize unknown parameter: mtry
Code
rf_res %>% 
  show_best(metric = "roc_auc")
# A tibble: 3 × 8
   mtry min_n .metric .estimator  mean     n std_err .config             
  <int> <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
1    18    29 roc_auc binary     0.922     1      NA Preprocessor1_Model1
2    24     8 roc_auc binary     0.921     1      NA Preprocessor1_Model3
3     1    18 roc_auc binary     0.871     1      NA Preprocessor1_Model2
Code
autoplot(rf_res)

Code
rf_best <- 
  rf_res %>% 
  select_best(metric = "roc_auc")
rf_best
# A tibble: 1 × 3
   mtry min_n .config             
  <int> <int> <chr>               
1    18    29 Preprocessor1_Model1
Code
rf_res %>% 
  collect_predictions()
# A tibble: 22,500 × 8
   id         .pred_children .pred_none  .row  mtry min_n children .config      
   <chr>               <dbl>      <dbl> <int> <int> <int> <fct>    <chr>        
 1 validation       0.187         0.813    13    18    29 none     Preprocessor…
 2 validation       0.0381        0.962    20    18    29 none     Preprocessor…
 3 validation       0.470         0.530    22    18    29 children Preprocessor…
 4 validation       0.00824       0.992    23    18    29 none     Preprocessor…
 5 validation       0.00592       0.994    31    18    29 none     Preprocessor…
 6 validation       0.000445      1.00     38    18    29 none     Preprocessor…
 7 validation       0             1        39    18    29 none     Preprocessor…
 8 validation       0.00676       0.993    50    18    29 none     Preprocessor…
 9 validation       0.0346        0.965    54    18    29 none     Preprocessor…
10 validation       0.0340        0.966    57    18    29 children Preprocessor…
# … with 22,490 more rows
Code
rf_auc <- 
  rf_res %>% 
  collect_predictions(parameters = rf_best) %>% 
  roc_curve(children, .pred_children) %>% 
  mutate(model = "Random Forest")
Code
bind_rows(rf_auc, lr_auc) %>% 
  ggplot(aes(x = 1 - specificity, y = sensitivity, col = model)) + 
  geom_path(linewidth = 1.5, alpha = 0.8) +
  geom_abline(lty = 3) + 
  coord_equal() + 
  scale_color_viridis_d(option = "plasma", end = .6)

5. last fit on the test set

Code
last_rf_mod <- 
  rand_forest(mtry = 8, min_n = 7, trees = 1000) %>% 
  # importance = "impurity" makes ranger record variable importance for the vip() plot below
  set_engine("ranger", num.threads = cores, importance = "impurity") %>% 
  set_mode("classification")

# the last workflow
last_rf_workflow <- 
  rf_workflow %>% 
  update_model(last_rf_mod)

# the last fit
set.seed(345)
last_rf_fit <- 
  last_rf_workflow %>% 
  last_fit(splits)

last_rf_fit
# Resampling results
# Manual resampling 
# A tibble: 1 × 6
  splits                id               .metrics .notes   .predict…¹ .workflow 
  <list>                <chr>            <list>   <list>   <list>     <list>    
1 <split [37500/12500]> train/test split <tibble> <tibble> <tibble>   <workflow>
# … with abbreviated variable name ¹​.predictions
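
Note that the final model hard-codes mtry = 8 and min_n = 7 rather than the values selected above (mtry = 18, min_n = 29). To carry the tuned values forward automatically, finalize_workflow() can splice rf_best into the workflow (a sketch, not what was run here; importance = "impurity" would still need to be added to the engine for the vip() plot):

Code
# Sketch: plug the selected parameters into the tuned workflow instead of hard-coding them
last_rf_workflow_alt <- 
  rf_workflow %>% 
  finalize_workflow(rf_best)
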
Code
last_rf_fit %>% 
  collect_metrics()
# A tibble: 2 × 4
  .metric  .estimator .estimate .config             
  <chr>    <chr>          <dbl> <chr>               
1 accuracy binary         0.946 Preprocessor1_Model1
2 roc_auc  binary         0.924 Preprocessor1_Model1
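
The test-set ROC AUC (0.924) is in line with the validation estimates seen during tuning (around 0.92), so performance holds up on data the model never saw during training or tuning.
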
Code
last_rf_fit %>% 
  extract_fit_parsnip() %>% 
  vip(num_features = 20)

Code
last_rf_fit %>% 
  collect_predictions() %>% 
  roc_curve(children, .pred_children) %>% 
  autoplot()

References

https://www.youtube.com/watch?v=dbXDkEEuvCU

https://juliasilge.com/blog/hotels-recipes/